--- layout: post title: "User Location Validation" date: 2020-07-21 18:46:30 -0500 categories: jekyll update --- User Location

User Location Validation

What follows is an exploration of the validity of self-reported user location data in tweets. While only around 5% of the decahose have this data in a machine processable format (roughly matching the format city, state, e.g. Burlington, VT), we hope to show that this subset is a reliable proxy for a user's real world location.

We find that generally user self-reported location is reliable with ~80% of user's having a median tweet location within 50km of their self reported cities. While this is only true for user's that actually provide a location which can be fuzzy-string matched to a city, state pair, around 5% of tweets do have this feature.

In [1]:
import os
import sys
import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import subprocess
from multiprocessing import Pool
sys.path.append(f'{os.getenv("HOME")}/tweet_query/src/')
from itertools import combinations
from tweet_db_query import tweet_connect, ambient_ngrams_dataframe, rank_divergence, extract_timeseries, get_ambient_ngrams, save_ambient_timeseries, top_n_timeseries, assemble_ambient_ngrams
from pymongo import MongoClient
from measurements import compute_pca
import dill
sys.path.append(f'{os.getenv("HOME")}/.passwords')
import mongo_password
sys.path.append(f'{os.getenv("HOME")}/ngram_query')
import mongo_query
from cenpy import products
import cenpy
import geopandas as gpd
import plotly.figure_factory as ff
from pprint import pprint
from matplotlib.lines import Line2D
import warnings
import plotly
import plotly.express as px
In [3]:
import matplotlib
font = {'family' : 'normal',
        'weight' :'normal',
        'size'   : 22}

matplotlib.rc('font', **font)
matplotlib.rcParams['agg.path.chunksize'] = 10000
In [4]:
sys.path.append(f'{os.getenv("HOME")}/tweet_utils/src')
import utils as tweet_utils
sys.path.append(f'{os.getenv("HOME")}/storywrangler/src/')
from regexr import get_emojis_parser, get_ngrams_parser, filter_text, remove_whitespaces, ngrams
import counter
In [135]:
def db_selection(day, high_res=False, ambient='mental_health'):
    db = f'tweets_segmented_location'
    collection = f"{day.year}"
    
    return db, collection

def tweets_per_day(day, high_res=False, tweets=None):
    if tweets is None:
        db,collection = db_selection(day, high_res)
        tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)
    
    query = {'tweet_created_at':{"$gte": day ,"$lt": day+relativedelta(days=+1)}}
    
    return tweets.count_documents(query)

def tweets_per_state(state, year, tweets=None):
    day = datetime.datetime(year,1,1)
    if tweets is None:
        db,collection = db_selection(day)
        tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)
    
    query = {'state': state}
    
    return tweets.count_documents(query)

def tweets_per_city(city, year, tweets=None):
    day = datetime.datetime(year,1,1)
    if tweets is None:
        db,collection = db_selection(day)
        tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)
    
    query = {'city_state': {'$regex': f"^{city}"}}
    
    return tweets.count_documents(query)

Census Population by State

How well does the number of tweets per state correlate with state population? Skip to here for plots.

In [5]:
state_codes = {
    'WA': '53', 'DE': '10', 'DC': '11', 'WI': '55', 'WV': '54', 'HI': '15',
    'FL': '12', 'WY': '56', 'NJ': '34', 'NM': '35', 'TX': '48',
    'LA': '22', 'NC': '37', 'ND': '38', 'NE': '31', 'TN': '47', 'NY': '36',
    'PA': '42', 'AK': '02', 'NV': '32', 'NH': '33', 'VA': '51', 'CO': '08',
    'CA': '06', 'AL': '01', 'AR': '05', 'VT': '50', 'IL': '17', 'GA': '13',
    'IN': '18', 'IA': '19', 'MA': '25', 'AZ': '04', 'ID': '16', 'CT': '09',
    'ME': '23', 'MD': '24', 'OK': '40', 'OH': '39', 'UT': '49', 'MO': '29',
    'MN': '27', 'MI': '26', 'RI': '44', 'KS': '20', 'MT': '30', 'MS': '28',
    'SC': '45', 'KY': '21', 'OR': '41', 'SD': '46'
}
state_pop = {
    'WA': '53', 'DE': '10', 'DC': '11', 'WI': '55', 'WV': '54', 'HI': '15',
    'FL': '12', 'WY': '56', 'NJ': '34', 'NM': '35', 'TX': '48',
    'LA': '22', 'NC': '37', 'ND': '38', 'NE': '31', 'TN': '47', 'NY': '36',
    'PA': '42', 'AK': '02', 'NV': '32', 'NH': '33', 'VA': '51', 'CO': '08',
    'CA': '06', 'AL': '01', 'AR': '05', 'VT': '50', 'IL': '17', 'GA': '13',
    'IN': '18', 'IA': '19', 'MA': '25', 'AZ': '04', 'ID': '16', 'CT': '09',
    'ME': '23', 'MD': '24', 'OK': '40', 'OH': '39', 'UT': '49', 'MO': '29',
    'MN': '27', 'MI': '26', 'RI': '44', 'KS': '20', 'MT': '30', 'MS': '28',
    'SC': '45', 'KY': '21', 'OR': '41', 'SD': '46'
}
us_state_abbrev = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming',
    'DC': "Washington, DC"
}
In [ ]:
# get population data
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for key,value in state_codes.items():
        state = products.ACS(2017).from_state(us_state_abbrev[key], level='county',
                                      variables='B01001_001E')
    
        state_pop[key] = state['B01001_001E'].sum()

# get tweet count data
for key,value in state_codes.items():
    tweet_count = tweets_per_state(key,2009)
    state_codes[key] = tweet_count

# save data
df = pd.DataFrame([(key, state_pop[key],state_codes[key]) for key,value in state_codes.items()],columns=['State','Population','Tweets'])
df.to_csv('tweet_data')

Tweet Scaling with Population

In [7]:
df1 = pd.read_csv('tweet_data')
fig = px.scatter(df1, x="Population", y="Tweets",
                  hover_data=['State'],title='User Location by State: 2009')
fig.update_traces( marker=dict(size=9,  
                               line=dict(width=2,
                               color='DarkSlateGrey')))
fig.update_layout(xaxis_type="log", yaxis_type="log")
plotly.offline.plot(fig, filename='tweets_per_state')

fig.show()
/home/michael/anaconda3/envs/geopandas/lib/python3.8/site-packages/plotly/offline/offline.py:562: UserWarning:

Your filename `tweets_per_state` didn't end with .html. Adding .html to the end of your file.

We can see that states with larger populations also tend to have a higher number of tweets per capita.

In [9]:
df1['tweets_per_capita'] = df1['Tweets']/df1['Population']
fig = px.scatter(df1, x="Population", y='tweets_per_capita',
                  hover_data=['State'],title='User Location by State: 2009')
fig.update_traces( marker=dict(size=9,  
                               line=dict(width=2,
                               color='DarkSlateGrey')))
fig.update_layout(xaxis_type="log", yaxis_type="log")
plotly.offline.plot(fig, filename='tweets_per_captia')
fig.show()
/home/michael/anaconda3/envs/geopandas/lib/python3.8/site-packages/plotly/offline/offline.py:562: UserWarning:

Your filename `tweets_per_captia` didn't end with .html. Adding .html to the end of your file.

Mapping Tweets per Capita

In [12]:
# merge data with shapefile
gdf = gpd.read_file('/home/michael/Downloads/state_shapefile/states.dbf')
gdf = gdf.merge(df1, left_on='STATE_ABBR',right_on='State')

# plot
figsize=(20,9)
gdf[gdf['State']!='DC'].plot(column='tweets_per_capita',figsize=figsize, legend=True, cmap='Blues')
plt.title('Tweets Per Capita: 2009')
plt.axis('off')
findfont: Font family ['normal'] not found. Falling back to DejaVu Sans.
Out[12]:
(-183.78001471754627,
 -61.40685490357737,
 16.297563892643026,
 74.03045784641098)
findfont: Font family ['normal'] not found. Falling back to DejaVu Sans.

Mapping Happiness

Now that we have a tool to query tweets by state, let's map the average happiness by state.

In [13]:
import pathlib
import counter
from tweet_db_query import *
from sentiment import *
def get_ngrams_by_state(state, scheme=1, lang='en', collection='2009', database='tweets_segmented_location'):
    """function to query n-gram counts by fuzzy string matched user-provided location
    :param state: state to query
    :param scheme: length of n-gram to parse
    :param lang: language to query
    :param collection: mongo collection to query
    :param database: mongo database to query
    :return: a counter object of n-grams
    """

    ngrams_pth = f'../../ngrams.bin'
    ngrams_parser = get_ngrams_parser(ngrams_pth)

    tweets = tweet_connect('guest','roboctopus', collection=collection, database=database)



    counter_i = counter.NgramCounter({})
    for t in tweets.find({'state':state}):
        counter_i += parse_ngrams_tweet(t, ngrams_parser, scheme)
    return counter_i
In [15]:
# load labMT happiness dictionaries
happ_dict = {}
happs = pd.read_csv('/home/michael/labMT2english.txt',sep='\t')
happs = happs.set_index('word')
word2score_ref = happs['happs'].to_dict()

# query tweets by state and score sentiment
for state in gdf['State']:
    counter_i = get_ngrams_by_state(state)
    sentiment_value_i = counter_sentiment(counter_i, word2score_ref)
    happ_dict[state] = sentiment_value_i
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-15-c6c3f441ca3d> in <module>
      5 word2score_ref = happs['happs'].to_dict()
      6 for state in gdf['State']:
----> 7     counter_i = get_ngrams_by_state(state)
      8     sentiment_value_i = counter_sentiment(counter_i, word2score_ref)
      9     happ_dict[state] = sentiment_value_i

<ipython-input-13-8e5ff71fe61b> in get_ngrams_by_state(state, scheme, lang, collection, database)
     21 
     22     counter_i = counter.NgramCounter({})
---> 23     for t in tweets.find({'state':state}):
     24         counter_i += parse_ngrams_tweet(t, ngrams_parser, scheme)
     25     return counter_i

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/cursor.py in next(self)
   1154         if self.__empty:
   1155             raise StopIteration
-> 1156         if len(self.__data) or self._refresh():
   1157             if self.__manipulate:
   1158                 _db = self.__collection.database

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/cursor.py in _refresh(self)
   1091                                     self.__max_await_time_ms,
   1092                                     self.__exhaust_mgr)
-> 1093             self.__send_message(g)
   1094 
   1095         return len(self.__data)

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/cursor.py in __send_message(self, operation)
    951 
    952         try:
--> 953             response = client._run_operation_with_response(
    954                 operation, self._unpack_response, exhaust=self.__exhaust,
    955                 address=self.__address)

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/mongo_client.py in _run_operation_with_response(self, operation, unpack_res, exhaust, address)
   1340                 unpack_res)
   1341 
-> 1342         return self._retryable_read(
   1343             _cmd, operation.read_preference, operation.session,
   1344             address=address,

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/mongo_client.py in _retryable_read(self, func, read_pref, session, address, retryable, exhaust)
   1462                         # not support retryable reads, raise the last error.
   1463                         raise last_error
-> 1464                     return func(session, server, sock_info, slave_ok)
   1465             except ServerSelectionTimeoutError:
   1466                 if retrying:

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/mongo_client.py in _cmd(session, server, sock_info, slave_ok)
   1332 
   1333         def _cmd(session, server, sock_info, slave_ok):
-> 1334             return server.run_operation_with_response(
   1335                 sock_info,
   1336                 operation,

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/server.py in run_operation_with_response(self, sock_info, operation, set_slave_okay, listeners, exhaust, unpack_res)
    115             if send_message:
    116                 sock_info.send_message(data, max_doc_size)
--> 117                 reply = sock_info.receive_message(request_id)
    118             else:
    119                 reply = sock_info.receive_message(None)

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/pool.py in receive_message(self, request_id)
    644                                    self.max_message_size)
    645         except BaseException as error:
--> 646             self._raise_connection_failure(error)
    647 
    648     def _raise_if_not_writable(self, unacknowledged):

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/pool.py in receive_message(self, request_id)
    641         """
    642         try:
--> 643             return receive_message(self.sock, request_id,
    644                                    self.max_message_size)
    645         except BaseException as error:

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/network.py in receive_message(sock, request_id, max_message_size)
    194     # Ignore the response's request id.
    195     length, _, response_to, op_code = _UNPACK_HEADER(
--> 196         _receive_data_on_socket(sock, 16))
    197     # No request_id for exhaust cursor "getMore".
    198     if request_id is not None:

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/network.py in _receive_data_on_socket(sock, length)
    253         while bytes_read < length:
    254             try:
--> 255                 chunk_length = sock.recv_into(mv[bytes_read:])
    256             except (IOError, OSError) as exc:
    257                 if _errno_from_exception(exc) == errno.EINTR:

KeyboardInterrupt: 
In [48]:
# Only run if necessary. querying takes a long time.
rerun = False
if rerun:
    gdf.to_file('2009_states_happiness')
In [16]:
# reload file: Run
gdf = gpd.read_file('2009_states_happiness')
In [17]:
figsize=(30,7)
gdf[gdf['State']!='DC'].plot(column='happs',figsize=figsize, legend=True, cmap='Blues')
plt.title('Tweet Sentiment by State: 2009')
plt.axis('off')
Out[17]:
(-183.78001471754624, -61.40685490357737, 16.29756389264303, 74.03045784641098)

City Mentions Timeseries and Network

One major upgrade made possible by the Mongo database is querying for tweets containing anchor words. Using aggregations we can now quickly get timeseries of the number of tweets containing a word in a city or state

In [24]:
from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=False)
zipcodes = search.by_state(state='New York',zipcode_type=None, returns=10000)

# count population of New York
x = 0
for i in zipcodes:
    if i.population:
        x += i.population
x
Out[24]:
19378077
In [89]:
def get_city_mentions_by_city(word, scheme=1, lang='en', collection='2009', database='tweets_segmented_location'):
    """Aggregate mentions of a word by city
    :param word: ambient anchor word
    :param dates: a pandas DatetimeIndex array
    :param scheme: length of n-gram to parse
    :param lang: language to query
    :param collection: mongo collection to query
    :return: a counter object of n-grams
    """

    ngrams_pth = f'../../ngrams.bin'
    ngrams_parser = get_ngrams_parser(ngrams_pth)

    tweets = tweet_connect('guest','roboctopus', collection=collection, database=database)



    for t in tweets.aggregate([{'$match': {'$text':{'$search': f'\"{word}\"'}}},
                               {'$group':{
                                   "_id":{
                                       'city_state': "$city_state"
                                   },
                                   'count': {"$sum":1}
                               }}
                              ]):
        yield t

        
def get_weekly_mentions_by_city(word, scheme=1, lang='en', collection='2009', database='tweets_segmented_location'):
    """Aggregate mentions of a word by city
    :param word: ambient anchor word
    :param dates: a pandas DatetimeIndex array
    :param scheme: length of n-gram to parse
    :param lang: language to query
    :param collection: mongo collection to query
    :return: a counter object of n-grams
    """

    ngrams_pth = f'../../ngrams.bin'
    ngrams_parser = get_ngrams_parser(ngrams_pth)

    tweets = tweet_connect('guest','roboctopus', collection=collection, database=database)



    for t in tweets.aggregate([{'$match': {'$text':{'$search': f'\"{word}\"'}}},
                               {'$group':{
                                   "_id":{
                                       'city_state': "$city_state",
                                       'week': {'$week':'$tweet_created_at'}
                                   },
                                   'count': {"$sum":1}
                               }}
                              ]):
        yield t

def get_language_by_city(city='New York, NY', collection='2009', database='tweets_segmented_location'):
    """Aggregate languages by city
    :param city: A city name in {city, state abbr} format
    :param collection: mongo collection to query
    :return: a counter object of n-grams
    """

    ngrams_pth = f'../../ngrams.bin'
    ngrams_parser = get_ngrams_parser(ngrams_pth)

    tweets = tweet_connect('guest','roboctopus', collection=collection, database=database)



    for t in tweets.aggregate([{'$match': {'city_state':city}},
                               {'$group':{
                                   "_id":{
                                       'fastText_lang': "$fastText_lang"
                                   },
                                   'count': {"$sum":1}
                               }}
                              ]):
        print(t)
        yield t

        
def plot_mentions(x, city='New York, NY', word='coronavirus', ax=None):
    """ Function to plot mentions of anchor word"""
    x1 = [i for i in sorted(x, key = lambda i: i['count'], reverse=True) if i['_id']['city_state']==city]
    weeks = []
    counts = []
    for week in x1:
        weeks.append(week['_id']['week'])
        counts.append(week['count'])
    weeks2 = pd.date_range(datetime.datetime(2020,1,1),datetime.datetime(2020,3,25),freq='w')
    if ax is None:
        f,ax = plt.subplots(figsize=(8,6))
    ax.plot(weeks,counts,'o')
    ax.set_title(city)
    ax.set_xlabel('Week of the Year')
    #ax.set_ylabel(f'Tweets with "{word}"')
    
    
def city_name(name):
    try:
        return name.split(',')[0]
    except:
        return None
        pass

Let's see how attention to the coronavirus evolved in time in some major US cities.

In [110]:
word = 'coronavirus'
x = [i for i in get_weekly_mentions_by_city(word, collection='2020')]
In [106]:
x[:10]
Out[106]:
[{'_id': {'city_state': 'Okeechobee, FL', 'week': 10}, 'count': 1},
 {'_id': {'city_state': 'Key West, FL', 'week': 5}, 'count': 1},
 {'_id': {'city_state': 'Hamilton, GA', 'week': 9}, 'count': 1},
 {'_id': {'city_state': 'Paramount, CA', 'week': 3}, 'count': 1},
 {'_id': {'city_state': 'Warren, NJ', 'week': 5}, 'count': 1},
 {'_id': {'city_state': 'Afton, TN', 'week': 1}, 'count': 1},
 {'_id': {'city_state': 'Magnolia, AR', 'week': 10}, 'count': 1},
 {'_id': {'city_state': 'Jber, AK', 'week': 8}, 'count': 1},
 {'_id': {'city_state': 'Haddonfield, NJ', 'week': 9}, 'count': 1},
 {'_id': {'city_state': 'Rossville, GA', 'week': 1}, 'count': 1}]
In [111]:
cities = ['Los Angeles, CA',
'New York, NY',
'Houston, TX',
'Washington, DC',
'Chicago, IL',
'Atlanta, GA',
'Dallas, TX',
'Miami, FL',
'New Middletown, IN',
'Austin, TX',
'Boston, MA',
'Seattle, WA',
'Brooklyn, NY',
'San Francisco, CA',
'San Diego, CA',
'Las Vegas, NV',
'Philadelphia, PA',
'San Antonio, TX',
'Portland, OR',
'Phoenix, AZ',
'Denver, CO',
'Orlando, FL',
'Hanna, UT',
'Baltimore, MD',
'Pittsburgh, PA',]

f,ax = plt.subplots(4,5,figsize=(19,15),sharey=True,sharex=True)
ax = ax.ravel()
for i,city in enumerate(cities[:20]):
    plot_mentions(x,city=city, ax=ax[i])
plt.tight_layout()
plt.suptitle(f'"{word}" Mentions ', y=1.04, fontsize=36)
Out[111]:
Text(0.5, 1.04, '"coronavirus" Mentions ')

Some obvious next steps here would be to correct for population density. With the full year it would be interesting to see how attention scales with recent case numbers, and it different cities have maximum attention at different times.

In [113]:
import networkx as nx
cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
       'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose',
       'Austin', 'Jacksonville', 'Fort Worth', 'San Francisco',
       'Columbus', 'Charlotte', 'Indianapolis', 'Seattle', 'Denver',
       'Washington Dc', 'El Paso', 'Boston', 'Nashville', 'Nashville',
       'Portland', 'Las Vegas', 'Detroit', 'Oklahoma City', 'Memphis',
       'Louisville', 'Louisville', 'Baltimore', 'Milwaukee',
       'Albuquerque', 'Tucson', 'Fresno', 'Sacramento', 'Mesa', 'Atlanta',
       'Kansas City', 'Colorado Springs', 'Miami', 'Raleigh',
       'Long Beach', 'Virginia Beach', 'Omaha', 'Oakland', 'Minneapolis',
       'Arlington', 'Tampa', 'Tulsa', 'New Orleans', 'Wichita',
       'Bakersfield', 'Cleveland', 'Aurora', 'Anaheim', 'Honolulu',
       'Riverside', 'Santa Ana', 'Lexington', 'Lexington',
       'Corpus Christi', 'Henderson', 'Stockton', 'St. Paul',
       'Pittsburgh', 'St. Louis', 'Cincinnati', 'Anchorage', 'Orlando',
       'Irvine', 'Plano', 'Greensboro', 'Lincoln', 'Newark', 'Durham',
       'Toledo', 'St. Petersburg', 'Chula Vista', 'Fort Wayne',
       'Scottsdale', 'Jersey City', 'Laredo', 'Madison', 'Lubbock',
       'Reno', 'Chandler', 'Glendale', 'Buffalo', 'North Las Vegas',
       'Gilbert', 'Winston Salem', 'Chesapeake', 'Irving', 'Norfolk',
       'Fremont', 'Hialeah', 'Richmond', 'Boise', 'Boise', 'Garland',
       'Baton Rouge', 'Spokane', 'Tacoma', 'Modesto', 'San Bernardino',
       'Fontana', 'Des Moines', 'Oxnard', 'Moreno Valley', 'Birmingham',
       'Fayetteville', 'Rochester', 'Amarillo', 'Port St. Lucie',
       'Yonkers', 'Mckinney', 'Grand Prairie', 'Salt Lake City',
       'Grand Rapids', 'Little Rock', 'Huntsville', 'Huntington Beach',
       'Augusta', 'Augusta', 'Overland Park', 'Montgomery', 'Tempe',
       'Akron', 'Cape Coral', 'Tallahassee', 'Frisco', 'Mobile',
       'Knoxville', 'Shreveport', 'Brownsville', 'Worcester',
       'Santa Clarita', 'Sioux Falls', 'Fort Lauderdale', 'Vancouver',
       'Rancho Cucamonga', 'Chattanooga', 'Newport News', 'Ontario',
       'Providence', 'Elk Grove', 'Salem', 'Oceanside', 'Santa Rosa',
       'Corona', 'Eugene', 'Garden Grove', 'Peoria', 'Pembroke Pines',
       'Fort Collins', 'Cary', 'Springfield', 'Jackson', 'Alexandria',
       'Hayward', 'Hollywood', 'Lakewood', 'Lancaster', 'Salinas',
       'Sunnyvale', 'Palmdale', 'Clarksville', 'Escondido', 'Pomona',
       'Pasadena', 'Killeen', 'Macon Bibb County', 'Joliet',
       'Murfreesboro', 'Mcallen', 'Savannah', 'Naperville', 'Paterson',
       'Thornton', 'Bellevue', 'Torrance', 'Rockford', 'Miramar',
       'Bridgeport', 'Mesquite', 'Fullerton', 'Denton', 'Waco',
       'Syracuse', 'Roseville', 'Orange', 'Surprise', 'Dayton',
       'Charleston', 'Olathe', 'Midland', 'West Valley City',
       'Gainesville']
In [115]:
G = nx.DiGraph()
for city in cities:
    
    city_dict = [G.add_edge(city_name(i['_id']['city_state']),city, weight=i['count']) for i in get_city_mentions_by_city(city)]
    
In [116]:
[(city_name(i['_id']['city_state']), i['count']) for i in get_city_mentions_by_city('Rome')]
Out[116]:
[('Clearwater', 2),
 ('Moorpark', 1),
 ('Atherton', 2),
 ('Mount Horeb', 1),
 ('Menomonie', 1),
 ('Anchorage', 5),
 ('Lanham', 1),
 ('Mystic', 1),
 ('Tustin', 2),
 ('Venice', 1),
 ('Middleburg', 1),
 ('Sedona', 1),
 ('Orange', 1),
 ('Phoenix', 23),
 ('Little Rock', 5),
 ('Cleveland', 14),
 ('Washington', 53),
 ('Aurora', 3),
 ('Provo', 1),
 ('Sapulpa', 1),
 ('Williamstown', 1),
 ('Leesburg', 3),
 ('Bentonville', 1),
 ('North Richland Hills', 1),
 ('Gray', 1),
 ('Daly City', 5),
 ('Des Plaines', 1),
 ('Gate City', 1),
 ('Encinitas', 4),
 ('Hardwick', 1),
 ('Playa Vista', 4),
 ('Gaithersburg', 1),
 ('Union City', 1),
 ('Baltimore', 19),
 ('Saint Charles', 3),
 ('Saint Petersburg', 3),
 ('Gerton', 1),
 ('Barboursville', 1),
 ('Knoxville', 12),
 ('Toledo', 4),
 ('Waukesha', 1),
 ('Topeka', 2),
 ('Adrian', 1),
 ('Statesboro', 2),
 ('San Juan', 5),
 ('Normal', 1),
 ('Bakersfield', 1),
 ('Greenville', 3),
 ('Ketchum', 1),
 ('Summerville', 3),
 ('Cuyahoga Falls', 4),
 ('Rea', 1),
 ('Blacksburg', 1),
 ('Tracy', 2),
 ('Lima', 1),
 ('Jasper', 1),
 ('Marion', 1),
 ('Seal Beach', 3),
 ('Abington', 1),
 ('Matteson', 1),
 ('Hartford', 17),
 ('Linn', 1),
 ('Chicago', 52),
 ('Santa Clarita', 2),
 ('Utica', 10),
 ('Attleboro', 1),
 ('Madison', 1),
 ('Manti', 1),
 ('Fairmont', 1),
 ('Hattiesburg', 2),
 ('Las Vegas', 11),
 ('Gainesville', 2),
 ('Downers Grove', 1),
 ('East Hartford', 2),
 ('Corona', 1),
 ('Pensacola', 3),
 ('Queens Village', 1),
 ('Petaluma', 1),
 ('Walkersville', 1),
 ('Troy', 2),
 ('Fort Wayne', 3),
 ('Eugene', 2),
 ('Largo', 1),
 ('Westmoreland', 1),
 ('Elyria', 2),
 ('Naperville', 3),
 ('Mcdonough', 1),
 ('Manchester', 1),
 ('Adamsville', 1),
 ('Tempe', 4),
 ('Hoboken', 1),
 ('Muskegon', 1),
 ('Durham', 5),
 ('Reston', 2),
 ('Milpitas', 1),
 ('Spring', 1),
 ('Norcross', 1),
 ('Butler', 1),
 ('Bennington', 2),
 ('Virginia Beach', 1),
 ('Long Beach', 7),
 ('Bethesda', 2),
 ('Martin', 1),
 ('Palo Alto', 4),
 ('Pompano Beach', 1),
 ('Houghton', 1),
 ('Maryland Heights', 1),
 ('Quincy', 1),
 ('North Port', 1),
 ('Tampa', 20),
 ('Alexandria', 5),
 ('Las Cruces', 1),
 ('Ogden', 5),
 ('Sebastopol', 2),
 ('Doylestown', 1),
 ('Iowa City', 1),
 ('Wilkes Barre', 1),
 ('San Francisco', 74),
 ('Cleveland', 1),
 ('Garden Grove', 1),
 ('Lakeland', 1),
 ('Norfolk', 5),
 ('La Jolla', 1),
 ('Rayne', 1),
 ('East Lansing', 9),
 ('Lancaster', 2),
 ('Caldwell', 1),
 ('Sherwood', 1),
 ('Westford', 2),
 ('Studio City', 1),
 ('Portsmouth', 2),
 ('Auburn', 3),
 ('Waterbury Center', 1),
 ('Marietta', 4),
 ('Many', 1),
 ('Oceanside', 3),
 ('Evansville', 1),
 ('Granville', 1),
 ('Roanoke', 2),
 ('Denton', 2),
 ('Rock Island', 1),
 ('Santa Ana', 2),
 ('Hayden', 1),
 (None, 4669),
 ('Chesterfield', 1),
 ('Sherman Oaks', 3),
 ('San Mateo', 1),
 ('West Barnstable', 2),
 ('El Paso', 4),
 ('Flemington', 1),
 ('Guntersville', 1),
 ('Denver', 21),
 ('Kent', 1),
 ('Bodega Bay', 1),
 ('Duluth', 2),
 ('Columbus', 2),
 ('Bay Shore', 3),
 ('La Canada Flintridge', 3),
 ('Snohomish', 1),
 ('Plano', 3),
 ('Jonesboro', 1),
 ('Castle Rock', 2),
 ('Destin', 1),
 ('Olympia', 3),
 ('Bradenton', 1),
 ('Tahlequah', 1),
 ('West Lafayette', 1),
 ('Dedham', 1),
 ('La Verne', 1),
 ('Huntington Beach', 3),
 ('Tinley Park', 2),
 ('Cornelius', 1),
 ('Fairmont', 1),
 ('North Adams', 1),
 ('Smithville', 1),
 ('Dumont', 1),
 ('Escondido', 1),
 ('Spotsylvania', 1),
 ('Joliet', 2),
 ('Saint Cloud', 1),
 ('Litchfield', 1),
 ('Coram', 1),
 ('Wichita', 3),
 ('Chaska', 1),
 ('Compton', 1),
 ('Portsmouth', 1),
 ('Warner Robins', 1),
 ('Pasadena', 4),
 ('Nashua', 1),
 ('Kennesaw', 3),
 ('Glenwood', 1),
 ('Columbia', 12),
 ('New Britain', 2),
 ('Green Bay', 2),
 ('Kent', 2),
 ('Guaynabo', 1),
 ('Prescott', 2),
 ('Tiffin', 1),
 ('Malvern', 1),
 ('Alhambra', 1),
 ('Burlington', 1),
 ('Seattle', 47),
 ('Fort Worth', 13),
 ('Flushing', 1),
 ('Eagar', 1),
 ('Richfield', 6),
 ('Rome', 212),
 ('Livonia', 1),
 ('Venice', 1),
 ('Mckinleyville', 1),
 ('El Mirage', 1),
 ('Poplar', 1),
 ('Gilbert', 4),
 ('Cartersville', 6),
 ('Napa', 1),
 ('Jerome', 9),
 ('Huntington', 1),
 ('North Las Vegas', 1),
 ('Nashville', 24),
 ('Ladera Ranch', 1),
 ('Rancho Santa Fe', 1),
 ('Abbeville', 1),
 ('Elk Park', 1),
 ('Round Rock', 1),
 ('Colchester', 1),
 ('Pomona', 2),
 ('Warren', 2),
 ('Springfield', 5),
 ('Austin', 70),
 ('Commerce Township', 1),
 ('Williamsburg', 6),
 ('Albany', 1),
 ('Beaver Dam', 1),
 ('Lake City', 2),
 ('Camas', 1),
 ('North Canton', 1),
 ('Texarkana', 1),
 ('Nice', 1),
 ('Keene', 11),
 ('Decatur', 1),
 ('Worcester', 1),
 ('Kissimmee', 1),
 ('Loris', 2),
 ('Palm Springs', 2),
 ('Huntington', 1),
 ('Elmhurst', 1),
 ('Monroe', 1),
 ('Sacramento', 12),
 ('Pacific Palisades', 1),
 ('Colonial Heights', 1),
 ('Vancouver', 3),
 ('Marquette', 2),
 ('Dover', 2),
 ('Raleigh', 8),
 ('Athens', 8),
 ('Bartow', 4),
 ('Celina', 1),
 ('Anton', 12),
 ('Greensboro', 6),
 ('Upland', 3),
 ('Nacogdoches', 1),
 ('Los Alamitos', 1),
 ('Hollywood', 1),
 ('Storrs Mansfield', 1),
 ('Cedar Hill', 1),
 ('Rochester', 3),
 ('Kirkland', 1),
 ('Rockford', 1),
 ('Casselberry', 1),
 ('Mount Pleasant', 3),
 ('Silver Spring', 5),
 ('Elizabeth City', 1),
 ('Roseville', 4),
 ('Hempstead', 2),
 ('Mountain View', 2),
 ('Winston', 1),
 ('Eden Prairie', 1),
 ('Dubuque', 3),
 ('Belding', 5),
 ('Houston', 45),
 ('Scranton', 1),
 ('Walpole', 1),
 ('Panama City', 1),
 ('Jersey City', 6),
 ('South Portland', 1),
 ('New Haven', 1),
 ('New York', 82),
 ('Rockville', 2),
 ('Oconomowoc', 1),
 ('Coronado', 1),
 ('Salem', 2),
 ('Bothell', 1),
 ('Shreveport', 3),
 ('Modesto', 1),
 ('Oneida', 3),
 ('Colorado Springs', 1),
 ('Salt Lake City', 10),
 ('Hope', 1),
 ('Lake Elsinore', 1),
 ('Stillwater', 1),
 ('Newport', 3),
 ('Davenport', 2),
 ('Laketown', 8),
 ('Brighton', 2),
 ('Pittsburgh', 19),
 ('Stone Mountain', 1),
 ('Rancho Cordova', 2),
 ('Jefferson', 1),
 ('Pontiac', 1),
 ('Grass Valley', 1),
 ('Pascagoula', 1),
 ('Opp', 1),
 ('Winona', 1),
 ('College Park', 1),
 ('Philadelphia', 13),
 ('Merrimack', 2),
 ('Arlington', 9),
 ('Orlando', 35),
 ('Sonoma', 1),
 ('Emeryville', 2),
 ('Damon', 1),
 ('Makawao', 1),
 ('Oklahoma City', 1),
 ('Atlanta', 2),
 ('Ocala', 1),
 ('Rolla', 1),
 ('Conway', 1),
 ('Orem', 2),
 ('Granbury', 4),
 ('Middletown', 1),
 ('Long Island City', 8),
 ('Youngstown', 3),
 ('Closter', 1),
 ('Greenwood', 2),
 ('Livermore', 2),
 ('Mesa', 2),
 ('Gulfport', 1),
 ('Cocoa', 1),
 ('Decatur', 1),
 ('New Hartford', 1),
 ('Alamogordo', 1),
 ('Center Valley', 1),
 ('Palm Desert', 1),
 ('Oakland', 1),
 ('Boise', 5),
 ('South Bend', 1),
 ('Santa Barbara', 4),
 ('Jacksonville', 19),
 ('Appleton', 2),
 ('Rogue River', 1),
 ('Oak View', 1),
 ('Chapel Hill', 2),
 ('Angelus Oaks', 1),
 ('Kalamazoo', 1),
 ('Cordova', 1),
 ('Beverly Hills', 6),
 ('Mooresville', 1),
 ('Exton', 1),
 ('Woodbine', 2),
 ('Rockwall', 1),
 ('La Crescenta', 2),
 ('Ventura', 2),
 ('Lake Zurich', 1),
 ('Riverton', 1),
 ('Cincinnati', 20),
 ('Monticello', 1),
 ('Bellingham', 3),
 ('Troy', 1),
 ('Memphis', 8),
 ('Melrose', 3),
 ('Charlotte', 17),
 ('Azusa', 1),
 ('Agoura Hills', 1),
 ('Fort Lauderdale', 9),
 ('Vienna', 1),
 ('Oxnard', 1),
 ('Woodland Hills', 1),
 ('Danville', 2),
 ('Waukegan', 1),
 ('Pinellas Park', 1),
 ('Northport', 1),
 ('Arlington', 2),
 ('Ames', 2),
 ('Wilmington', 4),
 ('Charleston', 5),
 ('Breckenridge', 1),
 ('Oxford', 1),
 ('Paradise Valley', 2),
 ('Dickinson', 1),
 ('Conyers', 1),
 ('Deerfield Beach', 1),
 ('Cresco', 2),
 ('Lawrence', 1),
 ('New London', 2),
 ('Bigelow', 1),
 ('Edmond', 2),
 ('Athens', 1),
 ('Ludlow', 2),
 ('Miami', 24),
 ('Stockton', 3),
 ('Evanston', 1),
 ('Brooklyn', 38),
 ('Lexington', 2),
 ('Santa Rosa', 2),
 ('Friendswood', 1),
 ('Versailles', 1),
 ('Jefferson City', 1),
 ('Bristol', 1),
 ('Big Bear Lake', 1),
 ('Lewisburg', 1),
 ('Mobile', 3),
 ('Hanna', 3),
 ('Hermosa Beach', 1),
 ('Salem', 3),
 ('Apopka', 1),
 ('Central Bridge', 1),
 ('Del Rio', 1),
 ('Roscoe', 1),
 ('Euclid', 1),
 ('Malibu', 3),
 ('Deshler', 1),
 ('San Marcos', 1),
 ('Wingate', 1),
 ('Rock', 2),
 ('Monroe', 1),
 ('Hamden', 2),
 ('Middlesboro', 1),
 ('Augusta', 10),
 ('Carbondale', 1),
 ('Spokane', 3),
 ('West Palm Beach', 3),
 ('Redondo Beach', 2),
 ('Merritt Island', 1),
 ('Herriman', 2),
 ('Lexington', 3),
 ('Pleasanton', 1),
 ('Danbury', 1),
 ('Henrico', 2),
 ('Marlborough', 1),
 ('Hutchinson', 1),
 ('Hampton', 1),
 ('Duluth', 3),
 ('Port Charlotte', 2),
 ('Akron', 3),
 ('Los Angeles', 144),
 ('Montgomery', 1),
 ('Norwich', 1),
 ('Bond', 2),
 ('Puyallup', 1),
 ('Reno', 3),
 ('Gardner', 1),
 ('Moscow', 1),
 ('Bellmawr', 1),
 ('Pasadena', 1),
 ('Louisville', 12),
 ('Portland', 39),
 ('Fort Washington', 2),
 ('Columbus', 34),
 ('Astoria', 1),
 ('North Highlands', 1),
 ('Brookline', 1),
 ('Erie', 4),
 ('Temecula', 1),
 ('Davis', 1),
 ('Clemmons', 2),
 ('Menlo Park', 1),
 ('Baton Rouge', 28),
 ('Convent', 1),
 ('Juneau', 1),
 ('Harvard', 1),
 ('Rockaway', 1),
 ('Big Lake', 1),
 ('Lawndale', 1),
 ('Milltown', 1),
 ('Bisbee', 2),
 ('Apex', 1),
 ('Georgetown', 1),
 ('Altamonte Springs', 1),
 ('Wellesley', 1),
 ('Oakland', 8),
 ('Lakewood', 1),
 ('Mohawk', 1),
 ('Wisconsin Rapids', 2),
 ('Sioux City', 2),
 ('West Hollywood', 6),
 ('Honolulu', 34),
 ('Cibolo', 1),
 ('Tuscaloosa', 6),
 ('Beaverton', 1),
 ('Fenton', 1),
 ('Harper', 1),
 ('Simsbury', 4),
 ('Farmington', 1),
 ('Turlock', 1),
 ('Hudson', 1),
 ('Katy', 3),
 ('West Enfield', 1),
 ('Fredericksburg', 4),
 ('Harrietta', 1),
 ('Milan', 1),
 ('Lewiston', 2),
 ('Fayetteville', 1),
 ('Big Flats', 2),
 ('Daytona Beach', 1),
 ('Ash', 3),
 ('Wyncote', 1),
 ('Grants Pass', 6),
 ('Tucson', 7),
 ('Saint Louis', 3),
 ('Chattanooga', 9),
 ('Lorain', 1),
 ('Scottsdale', 10),
 ('Athens', 2),
 ('Bokchito', 2),
 ('Sugar Land', 7),
 ('Sarasota', 4),
 ('Key West', 1),
 ('Nogales', 1),
 ('Ojai', 1),
 ('San Luis Obispo', 2),
 ('Brigham City', 4),
 ('Petersburg', 1),
 ('Glendora', 1),
 ('Lawrenceville', 1),
 ('Newtown', 1),
 ('Missoula', 2),
 ('Folsom', 1),
 ('Poughquag', 1),
 ('Stevens Point', 2),
 ('Vero Beach', 1),
 ('Locust Grove', 1),
 ('Pullman', 1),
 ('Severna Park', 1),
 ('North Hollywood', 13),
 ('Columbia', 2),
 ('Avon Lake', 1),
 ('Charlottesville', 1),
 ('Marina Del Rey', 1),
 ('Tulsa', 7),
 ('Brentwood', 5),
 ('Milwaukee', 24),
 ('Atlanta', 1),
 ('Des Moines', 6),
 ('San Pedro', 2),
 ('Nyack', 2),
 ('Gallup', 1),
 ('Garner', 1),
 ('Fargo', 6),
 ('Asheville', 8),
 ('Dewitt', 1),
 ('Trussville', 1),
 ('Bayonne', 1),
 ('Lafayette', 3),
 ('Laredo', 1),
 ('Orange', 1),
 ('Aliso Viejo', 1),
 ('Ontario', 1),
 ('Waverly', 1),
 ('Frisco', 6),
 ('Middleton', 2),
 ('Cary', 2),
 ('Kingsport', 1),
 ('Indianapolis', 13),
 ('Columbia', 1),
 ('Tupelo', 1),
 ('Laona', 1),
 ('Panorama City', 2),
 ('La Habra', 1),
 ('Nashwauk', 1),
 ('New London', 3),
 ('Moorhead', 1),
 ('Lafayette', 4),
 ('Omaha', 4),
 ('Glen Rose', 1),
 ('Avon', 1),
 ('Harrisburg', 1),
 ('Clovis', 3),
 ('Anderson', 1),
 ('Haverford', 4),
 ('Liverpool', 1),
 ('Huntsville', 6),
 ('Palmdale', 1),
 ('Maple Valley', 1),
 ('Dublin', 1),
 ('Sayreville', 1),
 ('Bend', 2),
 ('Palm Beach', 1),
 ('Valencia', 1),
 ('Fullerton', 1),
 ('Melbourne', 1),
 ('Del Mar', 3),
 ('Madison', 4),
 ('Huguenot', 1),
 ('Los Altos', 3),
 ('Chandler', 3),
 ('Sheboygan', 2),
 ('Tygh Valley', 3),
 ('Santa Fe', 3),
 ('Birmingham', 12),
 ('Flanders', 2),
 ('Hot Springs National Park', 1),
 ('Decatur', 1),
 ('Groton', 1),
 ('White Plains', 2),
 ('Wichita Falls', 3),
 ('California City', 1),
 ('East Petersburg', 5),
 ('Lackland A F B', 1),
 ('Staten Island', 3),
 ('West Des Moines', 1),
 ('Henrietta', 1),
 ('Grove City', 1),
 ('Waterloo', 1),
 ('New Middletown', 3),
 ('Berlin', 1),
 ('El Segundo', 1),
 ('Baldwinsville', 1),
 ('Alpena', 1),
 ('Lansdale', 3),
 ('Mount Clemens', 3),
 ('Lake Mary', 1),
 ('Ty Ty', 2),
 ('La Crosse', 1),
 ('Duarte', 1),
 ('New Bedford', 4),
 ('King Of Prussia', 1),
 ('Toa Baja', 1),
 ('Albuquerque', 5),
 ('Cambridge', 9),
 ('Sunnyvale', 1),
 ('Westminster', 1),
 ('Hemet', 1),
 ('Rome', 70),
 ('Minneapolis', 20),
 ('Berkeley', 6),
 ('Olive Branch', 1),
 ('Winterville', 1),
 ('Greenville', 2),
 ('Winston Salem', 28),
 ('Southfield', 2),
 ('Merced', 2),
 ('Mankato', 1),
 ('Fresno', 6),
 ('Dallas', 40),
 ('Bloomington', 5),
 ('Tacoma', 2),
 ('Huntington', 1),
 ('Douglas', 4),
 ('Newport News', 1),
 ('Glendale', 2),
 ('San Diego', 55),
 ('South Burlington', 1),
 ('Akron', 1),
 ('Costa Mesa', 1),
 ('Eau Claire', 3),
 ('Keysville', 1),
 ('Latham', 1),
 ('Irvine', 4),
 ('Pilot Hill', 2),
 ('Del Rio', 3),
 ('Princeton', 2),
 ('West Sacramento', 1),
 ('Paducah', 1),
 ('Albany', 8),
 ('Atlanta', 91),
 ('Marblehead', 2),
 ('Kansas City', 1),
 ('San Leandro', 2),
 ('Greenville', 1),
 ('Richmond', 1),
 ('Everett', 1),
 ('Waterbury', 2),
 ('Aurora', 2),
 ('Norwalk', 4),
 ('Thomasville', 1),
 ('Dekalb', 1),
 ('Midland', 1),
 ('Pueblo', 1),
 ('Moline', 1),
 ('Morrisville', 1),
 ('San Clemente', 2),
 ('Frederick', 1),
 ('Syracuse', 10),
 ('Orange', 8),
 ('San Antonio', 22),
 ('Grand Rapids', 5),
 ('Bloomington', 3),
 ('Buffalo', 18),
 ('Park City', 3),
 ('Chesaning', 1),
 ('Santa Maria', 1),
 ('Grayslake', 1),
 ('Middletown', 1),
 ('Mount Pleasant', 1),
 ('Amherst', 1),
 ('Franklin', 3),
 ('Lakeview', 1),
 ('Andover', 1),
 ('Wenham', 1),
 ('Visalia', 2),
 ('Canton', 2),
 ('Jacksonville', 1),
 ('Arlington', 2),
 ('Seabrook', 1),
 ('Trion', 1),
 ('Renton', 1),
 ('Moreno Valley', 1),
 ('State College', 1),
 ('Resaca', 1),
 ('Springfield', 1),
 ('Hollywood', 1),
 ('Mount Hood Parkdale', 1),
 ('Cedar Falls', 2),
 ('Camarillo', 1),
 ('Rocky Mount', 1),
 ('Calhoun', 1),
 ('Annapolis Junction', 1),
 ('Bay', 1),
 ('Glendale', 3),
 ('Baraboo', 1),
 ('Dora', 1),
 ('Pittsburg', 1),
 ('Sumter', 1),
 ('Darlington', 1),
 ('Monterey', 2),
 ('College Station', 1),
 ('Flagstaff', 3),
 ('Corvallis', 1),
 ('Camden', 1),
 ('Scott Depot', 2),
 ('Charlo', 1),
 ('Santa Monica', 2),
 ('Ashtabula', 1),
 ('Durango', 1),
 ('Clarion', 1),
 ('Circle Pines', 1),
 ('Windsor', 1),
 ('Northampton', 1),
 ('Wilmore', 1),
 ('Conroe', 1),
 ('Ithaca', 6),
 ('Torrance', 3),
 ('Kansas City', 9),
 ('Boston', 51),
 ('Grayson', 1),
 ('Bridgeport', 1),
 ('Adelanto', 3),
 ('Hanover', 1),
 ('Jackson', 3),
 ('Newport Beach', 4),
 ('Thousand Oaks', 2),
 ('Alexandria', 1),
 ('Bronx', 15),
 ('Manchester', 1),
 ('New Haven', 1),
 ('Rockport', 1),
 ('Dayton', 4),
 ('Hampton', 1),
 ('Dexter', 1),
 ('Tallahassee', 7),
 ('Forks', 21),
 ('Ossining', 1),
 ('Middlebury', 1),
 ('Eagle River', 2),
 ('Cooperstown', 3),
 ('Olathe', 2),
 ('Ann Arbor', 3),
 ('Clinton Township', 1),
 ('Elon', 1),
 ('Mansfield', 1),
 ('Le Roy', 1),
 ('Owatonna', 1),
 ('Oak Park', 1),
 ('Takoma Park', 1),
 ('Polk City', 1),
 ('Colquitt', 1),
 ('Santa Cruz', 4),
 ('Toano', 2),
 ('Boulder', 9),
 ('Redding', 3),
 ('Saint Paul', 2),
 ('Encino', 1),
 ('Champaign', 3),
 ('Ladonia', 1),
 ('Ocean City', 1),
 ('Macon', 1),
 ('Cockeysville', 1),
 ('Little Valley', 1),
 ('Orland', 1),
 ('Providence', 3),
 ('Orono', 2),
 ('Logan', 2),
 ('Savannah', 3),
 ('Schererville', 1),
 ('Newark', 2),
 ('Milford', 1),
 ('Binghamton', 4),
 ('Lake Saint Louis', 26),
 ('Ringgold', 2),
 ('Racine', 1),
 ('Solana Beach', 1),
 ('Woodburn', 1),
 ('Red Bank', 1),
 ('Cedar Rapids', 2),
 ('Canton', 1),
 ('Richmond', 1),
 ('Cupertino', 1),
 ('Arcadia', 1),
 ('Simi Valley', 7),
 ('Montclair', 1),
 ('Hyattsville', 3),
 ('York', 2),
 ('Gardena', 1),
 ('Somerville', 2),
 ('Mckinney', 2),
 ('Westminster', 1),
 ('Redwood City', 1),
 ('Wytheville', 2),
 ('Grosse Pointe', 1),
 ('Hickory', 1),
 ('Lake Charles', 1),
 ('Dunstable', 1),
 ('Clinton', 1),
 ('Green Castle', 1),
 ('Magee', 1),
 ('Miami Beach', 8),
 ('Wenatchee', 1),
 ('Fayetteville', 1),
 ('Hendersonville', 1),
 ('Lincoln', 2),
 ('Hearne', 1),
 ('Fort Hood', 1),
 ('North Zulch', 1),
 ('Lincoln', 1),
 ('Los Alamos', 3),
 ('Newton', 2),
 ('Cape Elizabeth', 1),
 ('Sharon', 1),
 ('Lancaster', 2),
 ('Racine', 3),
 ('Wilmington', 3),
 ('Saint Paul Park', 4),
 ('Lake Forest', 1),
 ('Palm Coast', 3),
 ('Union', 1),
 ('Briggsdale', 1),
 ('Bardstown', 1),
 ('Salisbury', 1),
 ('Irvington', 1),
 ('Detroit', 9),
 ('Mableton', 1),
 ('Russellville', 2),
 ('Brick', 1),
 ('New Castle', 1),
 ('Rochester', 11),
 ('Waterbury', 5),
 ('Baxter', 1),
 ('Covington', 1),
 ('Tyler', 1),
 ('Midland', 1),
 ('East Saint Louis', 1),
 ('New Orleans', 10),
 ('Cedar City', 1),
 ('Georgetown', 1),
 ('Gary', 1),
 ('Cambria Heights', 1),
 ('Myrtle Beach', 1),
 ('Fishers', 1),
 ('Crofton', 1),
 ('Bowling Green', 1),
 ('West Henrietta', 1),
 ('Lansing', 5),
 ('San Jose', 8),
 ('Valdosta', 6),
 ('Cypress', 1),
 ('Bessemer', 1),
 ('Anaheim', 1),
 ('Lubbock', 2),
 ('Cheltenham', 2),
 ('Mc Lean', 3),
 ('Riverside', 4),
 ('Fairfax', 4),
 ('Oxford', 1),
 ('Peabody', 3),
 ('New Baltimore', 1),
 ('Mukilteo', 1),
 ('Richmond', 10)]
In [117]:
len(G.edges())
Out[117]:
115762
In [118]:
G1 = G.subgraph(cities[:80:5])
weights = [np.sqrt(G[u][v]['weight'])/2 for u,v in G1.edges()]
In [119]:
f,ax = plt.subplots(figsize=(20,20))
nx.draw(G1,pos=nx.kamada_kawai_layout(G1,weight=None), ax=ax,with_labels=True,node_size=2500,width=weights)
In [128]:
degree_sequence = sorted([d for n, d in G.degree()], reverse=True)  # degree sequence
# print "Degree sequence", degree_sequence
degreeCount = collections.Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())

fig, ax = plt.subplots(figsize=(10,10))
plt.plot(deg, cnt,'o', color='b')
plt.yscale('log')
plt.xscale('log')

plt.title("Degree Histogram")
plt.ylabel("Count")
plt.xlabel("Degree")
Out[128]:
Text(0.5, 0, 'Degree')
In [129]:
        
cities = pd.read_csv('~/city_fame/data/cities',header=None)
population = pd.read_csv('~/city_fame/data/population',header=None, names=['Population'])
population.index = cities[0].values
#data = city_fame(population,np.median)
In [130]:
population['Tweets'] = 0
In [131]:
population.index
Out[131]:
Index(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
       'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose',
       ...
       'Syracuse', 'Roseville', 'Orange', 'Surprise', 'Dayton', 'Charleston',
       'Olathe', 'Midland', 'West Valley City', 'Gainesville'],
      dtype='object', length=200)
In [139]:
for city in population.index:
    tweet_count = tweets_per_city(city,2009)
    population['Tweets'][city] = tweet_count
In [137]:
fig = px.scatter(population, x="Population", y="Tweets",
                  hover_data=[population.index],title='User Location by City: 2009')
fig.update_traces( marker=dict(size=9,  
                               line=dict(width=2,
                               color='DarkSlateGrey')))
fig.update_layout(xaxis_type="log", yaxis_type="log")
plotly.offline.plot(fig, filename='tweets_per_state')

fig.show()
/home/michael/anaconda3/envs/geopandas/lib/python3.8/site-packages/plotly/offline/offline.py:562: UserWarning:

Your filename `tweets_per_state` didn't end with .html. Adding .html to the end of your file.

Verifying User Location

Verifying User Location

In [140]:
def get_user_loc(args):
    """ Check the tweet object and based on the user bio location return a
        state of origin
    :param args: (JSON object, uszipcode SearchEngine)
    :return:  string with "City,State" tweeter is from
    NOTE: if the tweet HAS NO LOCATION (doesn't exist, isn't in US, not in
    correct format, etc) this will return "None"
    also, has_loc returns True if there's a given location, but we can't classify it-
    I was using this for some sanity testing 
    """
    tweet, location_searcher = args
    has_loc = False
    if 'user' in tweet:
        if 'location' in tweet['user']:
            if tweet['user']['location']!=None:
                user_location_text = tweet['user']['location'].split(',')
                if len(user_location_text) == 2:
                    try:
                        user_location = location_searcher.by_city_and_state(user_location_text[0].strip(),\
                                                                            user_location_text[1].strip())[0]
                    except (ValueError, IndexError, KeyError):
                        pass
                    else:
                        has_loc = True

    elif 'location' in tweet['actor']:
        if tweet['actor']['location']!=None:
            user_location_text = tweet['actor']['location']['displayName'].split(',')
            if len(user_location_text) == 2:
                try:
                    user_location = location_searcher.by_city_and_state(user_location_text[0].strip(),\
                                                                        user_location_text[1].strip())[0]
                except (ValueError, IndexError, KeyError):
                    pass
                else:
                    has_loc = True

    if has_loc:
        city_state = f"{user_location.major_city}, {user_location.state}"
        state = user_location.state

        tweet['city_state'] = city_state
        tweet['state'] = state

        return user_location.lng, user_location.lat
    
def get_loc(args):
    """ Check the tweet object and based on the user bio location return a
        state of origin
    :param args: (JSON object, uszipcode SearchEngine)
    :return:  string with "City,State" tweeter is from
    NOTE: if the tweet HAS NO LOCATION (doesn't exist, isn't in US, not in
    correct format, etc) this will return "None"
    also, has_loc returns True if there's a given location, but we can't classify it-
    I was using this for some sanity testing 
    """
    tweet, location_searcher = args
    has_loc = False
    if 'user' in tweet:
        if 'location' in tweet['user']:
            if tweet['user']['location']!=None:
                user_location_text = tweet['user']['location'].split(',')
                if len(user_location_text) == 2:
                    try:
                        user_location = location_searcher.by_city_and_state(user_location_text[0].strip(),\
                                                                            user_location_text[1].strip())[0]
                    except (ValueError, IndexError, KeyError):
                        pass
                    else:
                        has_loc = True

    elif 'location' in tweet['actor']:
        if tweet['actor']['location']!=None:
            user_location_text = tweet['actor']['location']['displayName'].split(',')
            if len(user_location_text) == 2:
                try:
                    user_location = location_searcher.by_city_and_state(user_location_text[0].strip(),\
                                                                        user_location_text[1].strip())[0]
                except (ValueError, IndexError, KeyError):
                    pass
                else:
                    has_loc = True

    if has_loc:
        city_state = f"{user_location.major_city}, {user_location.state}"
        state = user_location.state

        tweet['city_state'] = city_state
        tweet['state'] = state

        return user_location.lng, user_location.lat
In [141]:
from uszipcode import SearchEngine
from shapely.geometry import Point, LineString

location_search = SearchEngine()

db = 'tweets'
collection='geotweets'
tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)
In [11]:
user_loc_list = []
loc_list = []
t_list = []
begin = datetime.datetime(2011,1,1)
end = datetime.datetime(2012,1,1)
query= {'tweet_created_at': {'$gte':begin,'$lt':end}}
for t in tweets.find(query,limit=2):
    user_loc = get_user_loc((t,location_search))
    if user_loc and user_loc[0]:
        print(get_user_loc((t,location_search)))
        user_loc_list.append(Point(get_user_loc((t,location_search))))
        loc_list.append(Point(t['geo']['coordinates']))
        t_list.append(t)
In [143]:
user_loc_gdf = gpd.GeoDataFrame(geometry=user_loc_list,crs={'init':'epsg:4326'})
loc_gdf = gpd.GeoDataFrame(geometry=loc_list,crs={'init':'epsg:4326'})
user_loc_gdf.to_crs(epsg=3310,inplace=True)
loc_gdf.to_crs(epsg=3310,inplace=True)
dist = user_loc_gdf.distance(loc_gdf)/1000
bins = np.logspace(0,5,20)

ax = dist.hist(bins=bins)
ax.set_xscale('log')
ax.set_xlabel('Distance from Tweet to User\'s given Home \n[km]')
Out[143]:
Text(0.5, 0, "Distance from Tweet to User's given Home \n[km]")
In [144]:
f = plt.figure(facecolor='w')
ax = dist.hist(bins=bins,density=False,figsize=(12,8),grid=False)
ax.set_xscale('log')
ax.set_xlabel('Distance from Tweet to User\'s given Home \n[km]')
ax.set_title('2011')
Out[144]:
Text(0.5, 1.0, '2011')
In [127]:
ax = dist.hist(bins=bins,density=False,figsize=(12,8),grid=False)
ax.set_xscale('log')
ax.set_xlabel('Distance from Tweet to User\'s given Home \n[km]')
ax.set_title('2010')
Out[127]:
Text(0.5, 1.0, '2010')
In [124]:
ax = dist.hist(bins=bins,density=False,figsize=(12,8),grid=False)
ax.set_xscale('log')
ax.set_xlabel('Distance from Tweet to User\'s given Home \n[km]')
ax.set_title('2009')
Out[124]:
Text(0.5, 1.0, '2009')
In [258]:
user_ids = []
n = 100000
begin = datetime.datetime(2010,1,1)
end = datetime.datetime(2014,1,1)
query= {'tweet_created_at': {'$gte':begin,'$lt':end}}
for t in tweets.find(limit=n):
    user_ids.append(t['user']['id'])
In [259]:
user_ids = set(user_ids)
N = len(user_ids)
In [260]:
# multiprocess user info
def get_user_loc_info(user_id):
    user_dict = {}
    db = 'tweets'
    collection='geotweets'
    tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)
    
    # get example tweet # assumes user account location doesn't change
    query = {'user.id':user_id}
    t = tweets.find_one(query)
    
    # find lat long of User defined account location 
    location_search = SearchEngine()
    user_loc = get_user_loc((t,location_search))
    if user_loc and user_loc[0]:
        user_dict['profile_location'] = user_loc
        query = {'user.id':user_id}
        user_dict['lat'] = []
        user_dict['lng'] = []
        for t in tweets.find(query,limit=1000):
            user_dict['tweet_created_at'] = t['tweet_created_at']
            user_dict['lat'].append(t['geo']['coordinates'][0])
            user_dict['lng'].append(t['geo']['coordinates'][1])
        return user_dict
    else:
        return None
    
def get_user_loc_info_local(user_id):
    user_dict = {}
    query = {'user.id':user_id}
    t = tweets.find_one(query)
    
    # find lat long of User defined account location 
    location_search = SearchEngine()
    user_loc = get_user_loc((t,location_search))
    if user_loc and user_loc[0]:
        user_dict['profile_location'] = user_loc
        user_dict['lat'] = []
        user_dict['lng'] = []
        user_dict['tweet_created_at'] = []
        for t in tweets.find(query,limit=1000):
            user_dict['tweet_created_at'].append(t['tweet_created_at'])
            user_dict['lat'].append(t['geo']['coordinates'][0])
            user_dict['lng'].append(t['geo']['coordinates'][1])
        return user_dict
    else:
        return None
    
    
def user_loc_change(user_id):
    query = {'user.id':user_id}
    
    user_loc = ''
    print(user_id,user_loc)
    counter = 0
    for t in tweets.find(query,limit=1000).sort('tweet_created_at',1):
        new_loc = t['user']['location']
        if user_loc != new_loc:
            print(user_id,new_loc, user_loc,t['tweet_created_at'])
            
            counter += 1
            user_loc = new_loc
    return counter
In [245]:
# does user location change?
db = 'tweets'
collection='geotweets'
tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)

# get example tweet # assumes user account location doesn't change


location_search = SearchEngine()

user_info_list = list(map(user_loc_change, user_ids))
222490627 
222490627 jubail  2011-06-24 17:54:47
222490627 Highland Heights,KY jubail 2012-03-10 22:26:28
222490627 jubail❤ /Highland Heights,KY Highland Heights,KY 2012-04-08 06:15:18
222490627 jubail❤ / Wilder,KY jubail❤ /Highland Heights,KY 2012-12-28 18:51:25
330487812 
252637189 
252637189 Salt Lake City, Utah  2011-07-02 17:37:28
221231109 
221231109 Miami  2011-04-01 17:05:04
221231109 Miami Gardens Miami 2011-12-03 21:47:00
26632199 
26632199 Bear,Delaware  2011-04-12 07:22:10
26632199 Delaware  Bear,Delaware 2013-06-11 15:50:12
206551049 
206551049 Tempe AZ  2011-07-05 00:21:52
206551049 Khaldiya - Arizona Tempe AZ 2012-06-15 18:20:57
18735117 
18735117 Cambridge, ON  2010-10-22 01:37:50
18735117 Kelowna, BC Cambridge, ON 2011-06-04 01:34:55
18735117 Hespeler, ON Kelowna, BC 2012-11-26 00:13:11
114894862 
114894862 Pittsburgh, PA  2011-04-04 01:11:06
114894862 New York, NY Pittsburgh, PA 2011-08-20 07:09:31
114894862 West$ide New York, NY 2013-11-20 18:06:27
114894862 Philadelphia, PA West$ide 2013-12-21 13:49:33
88094734 
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-245-ef21e6f27c66> in <module>
      9 location_search = SearchEngine()
     10 
---> 11 user_info_list = list(map(user_loc_change, user_ids))

<ipython-input-242-92028a0f3325> in user_loc_change(user_id)
     54     print(user_id,user_loc)
     55     counter = 0
---> 56     for t in tweets.find(query,limit=1000).sort('tweet_created_at',1):
     57         new_loc = t['user']['location']
     58         if user_loc != new_loc:

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/cursor.py in next(self)
   1154         if self.__empty:
   1155             raise StopIteration
-> 1156         if len(self.__data) or self._refresh():
   1157             if self.__manipulate:
   1158                 _db = self.__collection.database

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/cursor.py in _refresh(self)
   1071                                   self.__session,
   1072                                   self.__collection.database.client)
-> 1073             self.__send_message(q)
   1074         elif self.__id:  # Get More
   1075             if self.__limit:

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/cursor.py in __send_message(self, operation)
    951 
    952         try:
--> 953             response = client._run_operation_with_response(
    954                 operation, self._unpack_response, exhaust=self.__exhaust,
    955                 address=self.__address)

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/mongo_client.py in _run_operation_with_response(self, operation, unpack_res, exhaust, address)
   1340                 unpack_res)
   1341 
-> 1342         return self._retryable_read(
   1343             _cmd, operation.read_preference, operation.session,
   1344             address=address,

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/mongo_client.py in _retryable_read(self, func, read_pref, session, address, retryable, exhaust)
   1462                         # not support retryable reads, raise the last error.
   1463                         raise last_error
-> 1464                     return func(session, server, sock_info, slave_ok)
   1465             except ServerSelectionTimeoutError:
   1466                 if retrying:

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/mongo_client.py in _cmd(session, server, sock_info, slave_ok)
   1332 
   1333         def _cmd(session, server, sock_info, slave_ok):
-> 1334             return server.run_operation_with_response(
   1335                 sock_info,
   1336                 operation,

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/server.py in run_operation_with_response(self, sock_info, operation, set_slave_okay, listeners, exhaust, unpack_res)
    115             if send_message:
    116                 sock_info.send_message(data, max_doc_size)
--> 117                 reply = sock_info.receive_message(request_id)
    118             else:
    119                 reply = sock_info.receive_message(None)

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/pool.py in receive_message(self, request_id)
    644                                    self.max_message_size)
    645         except BaseException as error:
--> 646             self._raise_connection_failure(error)
    647 
    648     def _raise_if_not_writable(self, unacknowledged):

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/pool.py in receive_message(self, request_id)
    641         """
    642         try:
--> 643             return receive_message(self.sock, request_id,
    644                                    self.max_message_size)
    645         except BaseException as error:

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/network.py in receive_message(sock, request_id, max_message_size)
    194     # Ignore the response's request id.
    195     length, _, response_to, op_code = _UNPACK_HEADER(
--> 196         _receive_data_on_socket(sock, 16))
    197     # No request_id for exhaust cursor "getMore".
    198     if request_id is not None:

~/anaconda3/envs/geopandas/lib/python3.8/site-packages/pymongo/network.py in _receive_data_on_socket(sock, length)
    253         while bytes_read < length:
    254             try:
--> 255                 chunk_length = sock.recv_into(mv[bytes_read:])
    256             except (IOError, OSError) as exc:
    257                 if _errno_from_exception(exc) == errno.EINTR:

KeyboardInterrupt: 
In [103]:
f,ax = plt.subplots(figsize=(12,6),facecolor='w')
plt.hist(user_info_list,bins=30)
#ax.set_yscale('log')
plt.xlabel(' Number of Self-reported User Location Updates')
plt.ylabel('Number of Users')
Out[103]:
Text(0, 0.5, 'Number of Users')
In [ ]:
db = 'tweets'
collection='geotweets'
tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)

# get example tweet # assumes user account location doesn't change


location_search = SearchEngine()

user_info_list = list(map(get_user_loc_info_local, user_ids))
WARNING:root:Applied processor reduces input query to empty string, all comparisons will have score 0. [Query: '']
In [262]:
print(n,N,len([i for i in user_info_list if i is not None]))
100000 80002 7625
In [263]:
# filter out users with no matched account location
user_loc_dict = []
for key,value in zip(user_ids,user_info_list):
    if value is not None:      
        for lat,lng,time_stamp in zip(value['lat'],value['lng'],value['tweet_created_at']):  
            user_loc_dict.append( {'profile_location':Point(value['profile_location']),
                                  'user_id':key,
                                  'lat':lat,
                                  'lng':lng,
                                  'tweet_created_at':time_stamp})
        
In [264]:
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
In [265]:
df = pd.DataFrame(user_loc_dict)

gdf = gpd.GeoDataFrame(df,geometry=gpd.points_from_xy(df.lat, df.lng),crs={'init':'epsg:4326'})
gdf['profile_location'] = gpd.GeoSeries(gdf['profile_location'])
/home/michael/anaconda3/envs/geopandas/lib/python3.8/site-packages/pyproj/crs/crs.py:53: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  return _prepare_from_string(" ".join(pjargs))
In [ ]:
 
In [180]:
ax = world.plot(color='white', edgecolor='black',figsize=(20,8))

# We can now plot our ``GeoDataFrame``.
gdf.plot(ax=ax, color='red',alpha=0.1,markersize=0.5)

plt.show()
In [266]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in miles 3956. Use 6371 for kilometers
    return c * r

def distance_wrapper(point1,point2):
    
    return haversine(point1.x, point1.y, point2.x, point2.y)
In [267]:
gdf['distance'] = gdf.apply(lambda x: distance_wrapper(x['geometry'],x['profile_location']),axis=1)
gdf['line'] = gdf.apply(lambda x: LineString([[x['geometry'].x,x['geometry'].y],
                                              [x['profile_location'].x,x['profile_location'].y]]),
                                                axis=1)
gdf['line'] = gpd.GeoSeries(gdf['line'])
In [268]:
distance_list = []
date_list = []

for user_id in gdf['user_id'].unique():
    gdf_uix = gdf[gdf['user_id']==user_id].centroid.x.median()
    gdf_uiy = gdf[gdf['user_id']==user_id].centroid.y.median()
    point = Point(gdf_uix,gdf_uiy)
    distance = distance_wrapper(gdf[gdf['user_id']==user_id]['profile_location'].values[0],
                        point)
    distance_list.append(distance)
    
    date = gdf[gdf['user_id']==user_id]['tweet_created_at'].mean()
    date_list.append(date.year+(date.month/12))
<ipython-input-268-cee2994a3592>:5: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

  gdf_uix = gdf[gdf['user_id']==user_id].centroid.x.median()
<ipython-input-268-cee2994a3592>:6: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.

  gdf_uiy = gdf[gdf['user_id']==user_id].centroid.y.median()
In [236]:
gdf[gdf['user_id']==15073985][gdf['lat']<-74]
Out[236]:
profile_location user_id lat lng tweet_created_at geometry distance line
2525263 POINT (-74.08000 42.62000) 15073985 -74.299964 40.689904 2014-07-01 13:36:39 POINT (-74.29996 40.68990) 215.393280 LINESTRING (-74.29996 40.68990, -74.08000 42.6...
2525269 POINT (-74.08000 42.62000) 15073985 -74.207440 40.834662 2014-07-01 13:36:39 POINT (-74.20744 40.83466) 198.801958 LINESTRING (-74.20744 40.83466, -74.08000 42.6...
2525270 POINT (-74.08000 42.62000) 15073985 -74.005644 40.712151 2014-07-01 13:36:39 POINT (-74.00564 40.71215) 212.233017 LINESTRING (-74.00564 40.71215, -74.08000 42.6...
2525275 POINT (-74.08000 42.62000) 15073985 -74.011653 40.716689 2014-07-01 13:36:39 POINT (-74.01165 40.71669) 211.714633 LINESTRING (-74.01165 40.71669, -74.08000 42.6...
2525277 POINT (-74.08000 42.62000) 15073985 -74.004542 40.628625 2014-07-01 13:36:39 POINT (-74.00454 40.62862) 221.519578 LINESTRING (-74.00454 40.62862, -74.08000 42.6...
2525280 POINT (-74.08000 42.62000) 15073985 -74.003997 40.737713 2014-07-01 13:36:39 POINT (-74.00400 40.73771) 209.395892 LINESTRING (-74.00400 40.73771, -74.08000 42.6...
2525289 POINT (-74.08000 42.62000) 15073985 -74.199128 40.862479 2014-07-01 13:36:39 POINT (-74.19913 40.86248) 195.677161 LINESTRING (-74.19913 40.86248, -74.08000 42.6...
2525294 POINT (-74.08000 42.62000) 15073985 -74.233758 40.733609 2014-07-01 13:36:39 POINT (-74.23376 40.73361) 210.145347 LINESTRING (-74.23376 40.73361, -74.08000 42.6...
2525295 POINT (-74.08000 42.62000) 15073985 -74.166830 40.734600 2014-07-01 13:36:39 POINT (-74.16683 40.73460) 209.770868 LINESTRING (-74.16683 40.73460, -74.08000 42.6...
2525297 POINT (-74.08000 42.62000) 15073985 -74.233849 40.728056 2014-07-01 13:36:39 POINT (-74.23385 40.72806) 210.762165 LINESTRING (-74.23385 40.72806, -74.08000 42.6...
2525299 POINT (-74.08000 42.62000) 15073985 -74.233849 40.728056 2014-07-01 13:36:39 POINT (-74.23385 40.72806) 210.762165 LINESTRING (-74.23385 40.72806, -74.08000 42.6...
2525301 POINT (-74.08000 42.62000) 15073985 -74.233849 40.728056 2014-07-01 13:36:39 POINT (-74.23385 40.72806) 210.762165 LINESTRING (-74.23385 40.72806, -74.08000 42.6...
2525302 POINT (-74.08000 42.62000) 15073985 -74.233849 40.728056 2014-07-01 13:36:39 POINT (-74.23385 40.72806) 210.762165 LINESTRING (-74.23385 40.72806, -74.08000 42.6...
2525310 POINT (-74.08000 42.62000) 15073985 -97.093220 32.747868 2014-07-01 13:36:39 POINT (-97.09322 32.74787) 2292.828136 LINESTRING (-97.09322 32.74787, -74.08000 42.6...
2525314 POINT (-74.08000 42.62000) 15073985 -74.233849 40.728056 2014-07-01 13:36:39 POINT (-74.23385 40.72806) 210.762165 LINESTRING (-74.23385 40.72806, -74.08000 42.6...
2525322 POINT (-74.08000 42.62000) 15073985 -74.233849 40.728056 2014-07-01 13:36:39 POINT (-74.23385 40.72806) 210.762165 LINESTRING (-74.23385 40.72806, -74.08000 42.6...
2525323 POINT (-74.08000 42.62000) 15073985 -74.233849 40.728056 2014-07-01 13:36:39 POINT (-74.23385 40.72806) 210.762165 LINESTRING (-74.23385 40.72806, -74.08000 42.6...
2525332 POINT (-74.08000 42.62000) 15073985 -74.159894 40.897537 2014-07-01 13:36:39 POINT (-74.15989 40.89754) 191.643729 LINESTRING (-74.15989 40.89754, -74.08000 42.6...
2525333 POINT (-74.08000 42.62000) 15073985 -74.007900 40.740331 2014-07-01 13:36:39 POINT (-74.00790 40.74033) 209.095404 LINESTRING (-74.00790 40.74033, -74.08000 42.6...
2525387 POINT (-74.08000 42.62000) 15073985 -74.033500 40.772759 2014-07-01 13:36:39 POINT (-74.03350 40.77276) 205.440097 LINESTRING (-74.03350 40.77276, -74.08000 42.6...
2525392 POINT (-74.08000 42.62000) 15073985 -74.008499 40.715658 2014-07-01 13:36:39 POINT (-74.00850 40.71566) 211.836416 LINESTRING (-74.00850 40.71566, -74.08000 42.6...
2525394 POINT (-74.08000 42.62000) 15073985 -74.150729 40.894972 2014-07-01 13:36:39 POINT (-74.15073 40.89497) 191.904040 LINESTRING (-74.15073 40.89497, -74.08000 42.6...
2525397 POINT (-74.08000 42.62000) 15073985 -74.150729 40.894972 2014-07-01 13:36:39 POINT (-74.15073 40.89497) 191.904040 LINESTRING (-74.15073 40.89497, -74.08000 42.6...
2525408 POINT (-74.08000 42.62000) 15073985 -74.233233 40.733438 2014-07-01 13:36:39 POINT (-74.23323 40.73344) 210.161683 LINESTRING (-74.23323 40.73344, -74.08000 42.6...
2525421 POINT (-74.08000 42.62000) 15073985 -74.233580 40.728047 2014-07-01 13:36:39 POINT (-74.23358 40.72805) 210.761811 LINESTRING (-74.23358 40.72805, -74.08000 42.6...
2525429 POINT (-74.08000 42.62000) 15073985 -74.233580 40.728047 2014-07-01 13:36:39 POINT (-74.23358 40.72805) 210.761811 LINESTRING (-74.23358 40.72805, -74.08000 42.6...
2525435 POINT (-74.08000 42.62000) 15073985 -74.000861 40.710324 2014-07-01 13:36:39 POINT (-74.00086 40.71032) 212.447984 LINESTRING (-74.00086 40.71032, -74.08000 42.6...
2525461 POINT (-74.08000 42.62000) 15073985 -74.006400 40.711200 2014-07-01 13:36:39 POINT (-74.00640 40.71120) 212.336881 LINESTRING (-74.00640 40.71120, -74.08000 42.6...
2525468 POINT (-74.08000 42.62000) 15073985 -74.000255 40.723969 2014-07-01 13:36:39 POINT (-74.00026 40.72397) 210.932969 LINESTRING (-74.00026 40.72397, -74.08000 42.6...
2525479 POINT (-74.08000 42.62000) 15073985 -74.007733 40.710228 2014-07-01 13:36:39 POINT (-74.00773 40.71023) 212.441742 LINESTRING (-74.00773 40.71023, -74.08000 42.6...
2525532 POINT (-74.08000 42.62000) 15073985 -74.005173 40.720610 2014-07-01 13:36:39 POINT (-74.00517 40.72061) 211.293911 LINESTRING (-74.00517 40.72061, -74.08000 42.6...
2525549 POINT (-74.08000 42.62000) 15073985 -74.007233 40.740265 2014-07-01 13:36:39 POINT (-74.00723 40.74026) 209.104324 LINESTRING (-74.00723 40.74026, -74.08000 42.6...
2525551 POINT (-74.08000 42.62000) 15073985 -74.007377 40.723962 2014-07-01 13:36:39 POINT (-74.00738 40.72396) 210.916051 LINESTRING (-74.00738 40.72396, -74.08000 42.6...
2525564 POINT (-74.08000 42.62000) 15073985 -74.022188 40.765118 2014-07-01 13:36:39 POINT (-74.02219 40.76512) 206.309303 LINESTRING (-74.02219 40.76512, -74.08000 42.6...
2525569 POINT (-74.08000 42.62000) 15073985 -74.124379 40.832768 2014-07-01 13:36:39 POINT (-74.12438 40.83277) 198.765245 LINESTRING (-74.12438 40.83277, -74.08000 42.6...
2525583 POINT (-74.08000 42.62000) 15073985 -74.006757 40.713115 2014-07-01 13:36:39 POINT (-74.00676 40.71312) 212.123177 LINESTRING (-74.00676 40.71312, -74.08000 42.6...
2525586 POINT (-74.08000 42.62000) 15073985 -74.000055 40.711733 2014-07-01 13:36:39 POINT (-74.00006 40.71173) 212.293467 LINESTRING (-74.00006 40.71173, -74.08000 42.6...
2525587 POINT (-74.08000 42.62000) 15073985 -74.000055 40.711733 2014-07-01 13:36:39 POINT (-74.00006 40.71173) 212.293467 LINESTRING (-74.00006 40.71173, -74.08000 42.6...
2525599 POINT (-74.08000 42.62000) 15073985 -74.233234 40.733531 2014-07-01 13:36:39 POINT (-74.23323 40.73353) 210.151365 LINESTRING (-74.23323 40.73353, -74.08000 42.6...
2525615 POINT (-74.08000 42.62000) 15073985 -74.233234 40.733531 2014-07-01 13:36:39 POINT (-74.23323 40.73353) 210.151365 LINESTRING (-74.23323 40.73353, -74.08000 42.6...
2525622 POINT (-74.08000 42.62000) 15073985 -74.252129 40.891066 2014-07-01 13:36:39 POINT (-74.25213 40.89107) 192.777988 LINESTRING (-74.25213 40.89107, -74.08000 42.6...
2525633 POINT (-74.08000 42.62000) 15073985 -74.233234 40.733531 2014-07-01 13:36:39 POINT (-74.23323 40.73353) 210.151365 LINESTRING (-74.23323 40.73353, -74.08000 42.6...
2525644 POINT (-74.08000 42.62000) 15073985 -74.151640 40.894779 2014-07-01 13:36:39 POINT (-74.15164 40.89478) 191.927815 LINESTRING (-74.15164 40.89478, -74.08000 42.6...
2525651 POINT (-74.08000 42.62000) 15073985 -74.406800 40.695700 2014-07-01 13:36:39 POINT (-74.40680 40.69570) 215.687382 LINESTRING (-74.40680 40.69570, -74.08000 42.6...
2525656 POINT (-74.08000 42.62000) 15073985 -74.011148 40.715300 2014-07-01 13:36:39 POINT (-74.01115 40.71530) 211.870157 LINESTRING (-74.01115 40.71530, -74.08000 42.6...
2525657 POINT (-74.08000 42.62000) 15073985 -74.011148 40.715300 2014-07-01 13:36:39 POINT (-74.01115 40.71530) 211.870157 LINESTRING (-74.01115 40.71530, -74.08000 42.6...
2525662 POINT (-74.08000 42.62000) 15073985 -74.374106 40.726197 2014-07-01 13:36:39 POINT (-74.37411 40.72620) 211.992927 LINESTRING (-74.37411 40.72620, -74.08000 42.6...
2525679 POINT (-74.08000 42.62000) 15073985 -74.161376 40.898408 2014-07-01 13:36:39 POINT (-74.16138 40.89841) 191.551236 LINESTRING (-74.16138 40.89841, -74.08000 42.6...
2525691 POINT (-74.08000 42.62000) 15073985 -74.213862 40.807954 2014-07-01 13:36:39 POINT (-74.21386 40.80795) 201.796372 LINESTRING (-74.21386 40.80795, -74.08000 42.6...
2525697 POINT (-74.08000 42.62000) 15073985 -74.080852 40.889711 2014-07-01 13:36:39 POINT (-74.08085 40.88971) 192.399349 LINESTRING (-74.08085 40.88971, -74.08000 42.6...
2525726 POINT (-74.08000 42.62000) 15073985 -74.233532 40.728029 2014-07-01 13:36:39 POINT (-74.23353 40.72803) 210.763623 LINESTRING (-74.23353 40.72803, -74.08000 42.6...
2525731 POINT (-74.08000 42.62000) 15073985 -74.152929 40.896023 2014-07-01 13:36:39 POINT (-74.15293 40.89602) 191.792882 LINESTRING (-74.15293 40.89602, -74.08000 42.6...
2525734 POINT (-74.08000 42.62000) 15073985 -74.168189 40.900518 2014-07-01 13:36:39 POINT (-74.16819 40.90052) 191.337534 LINESTRING (-74.16819 40.90052, -74.08000 42.6...
2525735 POINT (-74.08000 42.62000) 15073985 -74.125851 40.731807 2014-07-01 13:36:39 POINT (-74.12585 40.73181) 209.991982 LINESTRING (-74.12585 40.73181, -74.08000 42.6...
2525736 POINT (-74.08000 42.62000) 15073985 -74.139375 40.715064 2014-07-01 13:36:39 POINT (-74.13937 40.71506) 211.876654 LINESTRING (-74.13937 40.71506, -74.08000 42.6...
2525741 POINT (-74.08000 42.62000) 15073985 -74.180981 40.865296 2014-07-01 13:36:39 POINT (-74.18098 40.86530) 195.293952 LINESTRING (-74.18098 40.86530, -74.08000 42.6...
2525742 POINT (-74.08000 42.62000) 15073985 -74.233276 40.796836 2014-07-01 13:36:39 POINT (-74.23328 40.79684) 203.125379 LINESTRING (-74.23328 40.79684, -74.08000 42.6...
2525744 POINT (-74.08000 42.62000) 15073985 -74.207079 40.833895 2014-07-01 13:36:39 POINT (-74.20708 40.83389) 198.885570 LINESTRING (-74.20708 40.83389, -74.08000 42.6...
2525758 POINT (-74.08000 42.62000) 15073985 -74.213419 40.886914 2014-07-01 13:36:39 POINT (-74.21342 40.88691) 193.027866 LINESTRING (-74.21342 40.88691, -74.08000 42.6...

Distance between User Reported Location and Median GPS coordinates

In [270]:
bins = np.logspace(0,4.5,20)
binsy = np.linspace(2009,2020,12*4)
f,ax = plt.subplots(figsize=(12,7),facecolor='w')
cbar = ax.hist2d(distance_list,date_list,bins=[bins,binsy],density=False)
ax.set_xscale('log')
ax.set_xlabel('Distance between Median phone (GPS) & \n Self-reported location (bio) \n [km]')
plt.colorbar(cbar[3])
Out[270]:
<matplotlib.colorbar.Colorbar at 0x7f83df8b5c70>
In [271]:
bins = np.logspace(0,4.5,20)
f,ax = plt.subplots(figsize=(12,7),facecolor='w')
ax.hist(distance_list,bins=bins,density=False)
ax.set_xscale('log')
ax.set_xlabel('Distance between Median phone (GPS) & \n Self-reported location (bio) \n [km]')
Out[271]:
Text(0.5, 0, 'Distance between Median phone (GPS) & \n Self-reported location (bio) \n [km]')
In [272]:
distance_array = np.array(distance_list)
account_total = np.count_nonzero(distance_array)
dist_list = [1,5,10,13,50,100,200,500]
for dist in dist_list:
    accounts_gt = np.count_nonzero(distance_array[distance_array < dist])
    print(f"{(accounts_gt*100/account_total):.1f}% user's median tweet location within {dist}km of self-reported account location")
4.0% user's median tweet location within 1km of self-reported account location
30.1% user's median tweet location within 5km of self-reported account location
50.7% user's median tweet location within 10km of self-reported account location
58.8% user's median tweet location within 13km of self-reported account location
79.4% user's median tweet location within 50km of self-reported account location
82.4% user's median tweet location within 100km of self-reported account location
85.6% user's median tweet location within 200km of self-reported account location
90.3% user's median tweet location within 500km of self-reported account location

Histograms of distance from city to tweet GPS

While a median location is given above, here we show the tweet level distribution. Unfortunately the user generated location is not re-queried at each step leading to a potentially large source of error if users move and update their bio location frequently.

In [273]:
bins = np.logspace(0,4.5,20)
ax =gdf['distance'].hist(bins=bins,density=False,figsize=(12,8),grid=False)
ax.set_xscale('log')
ax.set_xlabel('Distance from Tweet to User\'s given Home \n[km]')
Out[273]:
Text(0.5, 0, "Distance from Tweet to User's given Home \n[km]")
In [276]:
bins = np.logspace(0,4.5,50)
binsy = np.linspace(2009,2020,12*6)
f,ax = plt.subplots(figsize=(12,7),facecolor='w')
dates = np.array([i.year for i in gdf['tweet_created_at']])+np.array([(i.month-1)/12 for i in gdf['tweet_created_at']])
cbar = ax.hist2d(gdf['distance'],dates,bins=[bins,binsy],density=False)
ax.set_xscale('log')
ax.set_xlabel('Distance between phone location (GPS) & \n Self-reported location (bio) \n [km]')
plt.colorbar(cbar[3])
Out[276]:
<matplotlib.colorbar.Colorbar at 0x7f8351a170a0>
In [ ]: